In [30]:
from pymongo import MongoClient
In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import numpy as np
In [31]:
import pandas as pd
In [32]:
client = MongoClient()
In [33]:
client = MongoClient('localhost', 27017)
In [34]:
db = client.airbnb
In [35]:
cursor = db.Listing.find()
In [37]:
data = pd.DataFrame(list(cursor))
In [88]:
data.head()
Out[88]:
In [91]:
df_dummies = data[['host_response_rate', 'host_is_superhost',
'host_total_listings_count', 'host_has_profile_pic',
'host_identity_verified', 'accommodates', 'bathrooms', 'bedrooms',
'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin',
'review_scores_communication', 'review_scores_location',
'review_scores_value', 'instant_bookable',
'require_guest_profile_picture', 'require_guest_phone_verification',
'reviews_per_month', 'nearest_attr_dist', 'nearest_attr_rating',
'24-hour check-in', 'Family/kid friendly', 'Heating', 'Pets allowed',
'Internet', 'Smoking allowed', 'Suitable for events',
'Free parking on premises', 'Pool', 'Private entrance',
'Lock on bedroom door', 'Wheelchair accessible', 'TV',
'Indoor fireplace', 'Private living room', 'Pets live on this property',
'Elevator in building', 'Free parking on street',
'Paid parking off premises', 'Other pet(s)', 'Gym', 'Air conditioning',
'Kitchen', 'Cable TV', 'Breakfast', 'host_since_days',
'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
'roomtype_Entire home/apt', 'roomtype_Private room',
'roomtype_Shared room', 'cancellation_policy_flexible',
'cancellation_policy_moderate', 'cancellation_policy_strict',
'cancellation_policy_super_strict_30',
'cancellation_policy_super_strict_60']]
In [92]:
dataframe = df_dummies.dropna()
In [93]:
# Candidate predictors for the price regression (order preserved).
# Columns the author noted as left out: host_response_rate, host_is_superhost,
# number_of_reviews, host_since_days.
predictor_columns = [
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'availability_365',
    'instant_bookable', 'nearest_attr_dist', 'nearest_attr_rating',
    '24-hour check-in', 'Family/kid friendly', 'Heating', 'Pets allowed',
    'Internet', 'Smoking allowed', 'Suitable for events',
    'Free parking on premises', 'Pool', 'Private entrance',
    'Lock on bedroom door', 'Wheelchair accessible', 'TV', 'Indoor fireplace',
    'Private living room', 'Pets live on this property',
    'Elevator in building', 'Free parking on street',
    'Paid parking off premises', 'Other pet(s)', 'Gym', 'Air conditioning',
    'Kitchen', 'Cable TV', 'Breakfast',
    'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
    'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
    'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
    'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
    'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
    'roomtype_Entire home/apt', 'roomtype_Private room', 'roomtype_Shared room',
    'cancellation_policy_flexible', 'cancellation_policy_moderate',
    'cancellation_policy_strict', 'cancellation_policy_super_strict_30',
    'cancellation_policy_super_strict_60',
]
df_dummies1 = dataframe[predictor_columns]
In [95]:
y = dataframe["price"]
X = df_dummies1
In [96]:
lm = LinearRegression()
In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
lm.fit(X_train, y_train)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)
In [98]:
# MSE is measured on the test predictions; R^2 (and its adjusted form) on the
# training split.
print("MSE:", mean_squared_error(pred_test, y_test))
N, n_predictors = X_train.shape  # rows, columns of the training design matrix
r = lm.score(X_train, y_train)
r_square_adj = 1 - (1 - r) * (N - 1) / (N - n_predictors - 1)
print("r square:", r)
print("r square adj:", r_square_adj)
In [99]:
# Same regression through statsmodels to get the full coefficient summary.
# add_constant supplies the intercept column; inputs are passed as plain arrays.
X_train_constant = sm.add_constant(X_train)
est = sm.OLS(endog=np.asarray(y_train), exog=np.asarray(X_train_constant))
est2 = est.fit()
print(est2.summary())
In [100]:
%matplotlib inline
import pandas as pd
import numpy as np
import itertools
import time
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
In [140]:
# Inputs read (as globals) by the subset-selection helpers below.
X, y = df_dummies1, dataframe["price"]
In [141]:
def processSubset(feature_set):
    """Fit a linear regression on a subset of the global ``X`` columns.

    Parameters
    ----------
    feature_set : list
        Column labels of the global frame ``X`` to use as predictors.

    Returns
    -------
    dict
        ``"Predictors"`` is ``feature_set`` as passed in, ``"model"`` is the
        fitted ``LinearRegression`` and ``"Rsquared"`` is that model's R^2
        scored on the same rows (global ``X``/``y``) it was fitted on.
    """
    subset = X[feature_set]
    fitted = LinearRegression().fit(subset, y)
    return {
        "Predictors": feature_set,
        "model": fitted,
        "Rsquared": fitted.score(subset, y),
    }
In [142]:
def getBest(k, colsToKeep = [], colsToDrop = []):
    """Return the best model among all size-``k`` extensions of ``colsToKeep``.

    Every combination of ``k`` columns drawn from the global ``X`` (after
    removing ``colsToKeep`` and ``colsToDrop``) is joined with ``colsToKeep``
    and fitted via ``processSubset``; the fit with the highest R^2 wins.

    Parameters
    ----------
    k : int
        Number of additional columns to pick per candidate model.
    colsToKeep : list
        Columns forced into every candidate model.
    colsToDrop : list
        Columns excluded from consideration.

    Returns
    -------
    pandas.Series
        Row with ``"Predictors"``, ``"model"`` and ``"Rsquared"`` for the
        winning candidate.

    Notes
    -----
    The list defaults are only read here (concatenation builds new lists), so
    sharing them between calls is harmless.
    """
    tic = time.time()
    candidates = X.drop((colsToDrop + colsToKeep), axis=1)
    results = [
        processSubset(list(combo) + colsToKeep)
        for combo in itertools.combinations(candidates, k)
    ]
    # Wrap everything up in a dataframe: one row per fitted candidate.
    models = pd.DataFrame(results)
    # Choose the model with the highest R^2. idxmax returns the index *label*,
    # which is what .loc expects; argmax returns a position in current pandas
    # and only worked here because the index happens to be 0..n-1.
    best_model = models.loc[models["Rsquared"].idxmax()]
    toc = time.time()
    print(best_model["Rsquared"])
    print("Processed ", models.shape[0], "models on", k+len(colsToKeep), "predictors in", (toc-tic), "seconds.")
    # Return the best model, along with its predictors and score.
    return best_model
In [143]:
def forward(criteria):
    """Greedy forward selection over the columns of the global ``X``.

    Starting from no predictors, each round asks ``getBest`` for the single
    extra column that yields the highest R^2. The search stops when that
    round's R^2 gain is below ``criteria`` or every column has been used.

    Parameters
    ----------
    criteria : float
        Minimum R^2 improvement required to accept another predictor.

    Returns
    -------
    dict or pandas.Series
        The last accepted model, or an empty dict if no round was accepted.
    """
    started = time.time()
    chosen = []
    current_r2 = 0
    best_model = {}
    total_columns = len(X.columns)
    while len(chosen) < total_columns:
        candidate = getBest(1, colsToKeep=chosen, colsToDrop=[])
        gain = candidate["Rsquared"] - current_r2
        if gain < criteria:
            break
        best_model = candidate
        chosen = candidate["Predictors"]
        current_r2 = candidate["Rsquared"]
    elapsed = time.time() - started
    print("Forward Selction: ", elapsed, "seconds")
    return best_model
In [144]:
def backward(criteria):
    """Greedy backward elimination over the columns of the global ``X``.

    Starts from the model with every column, then repeatedly asks ``getBest``
    for the best model with one predictor fewer. The search stops when
    removing a predictor would cost more than ``criteria`` in R^2, or when a
    single predictor is left.

    Parameters
    ----------
    criteria : float
        Largest R^2 loss tolerated for dropping one more predictor.

    Returns
    -------
    dict or pandas.Series
        The smallest accepted model, or the full-model dict if no removal was
        accepted.
    """
    tic = time.time()
    # Baseline: the full model with every column of X.
    lm = LinearRegression()
    lm.fit(X, y)
    r = lm.score(X, y)
    predictors = X.columns
    colsToDrop = []
    best_model = {"Predictors": X.columns, "model": lm, "Rsquared": r}
    # Stop at one predictor: a further round would request subsets of size 0
    # and try to fit a regression with no features, which raises in sklearn.
    while len(predictors) > 1:
        model = getBest(len(predictors) - 1, [], colsToDrop)
        modelr = model["Rsquared"]
        if r - modelr > criteria:
            break
        best_model = model
        # Whatever is missing from the winning subset is the column removed this round.
        colsToDrop.extend(set(predictors) - set(model["Predictors"]))
        predictors = model["Predictors"]
        r = modelr
    toc = time.time()
    print("Backward Selction: ", (toc-tic), "seconds")
    return best_model
In [145]:
model0 = forward(0.00055)
In [146]:
model0["Predictors"]
Out[146]:
In [147]:
model1 = backward(0.00055)
In [148]:
model1["Predictors"]
Out[148]:
In [165]:
y = dataframe["price"]
In [166]:
feature_forward = dataframe[['neighbourhood_Sant Andreu',
'Elevator in building',
'Internet',
'Indoor fireplace',
'Family/kid friendly',
'neighbourhood_Gràcia',
'Gym',
'guests_included',
'extra_people',
'Air conditioning',
'neighbourhood_Sants-Montjuïc',
'Cable TV',
'Pool',
'nearest_attr_dist',
'roomtype_Entire home/apt',
'bathrooms',
'security_deposit',
'bedrooms',
'cleaning_fee',
'accommodates']]
In [167]:
# Refit the existing `lm` estimator on the forward-selected columns only.
X = feature_forward
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1
)
lm.fit(X_train, y_train)
pred_train, pred_test = lm.predict(X_train), lm.predict(X_test)
In [168]:
# Test-set MSE, then training-set R^2 and adjusted R^2 for the forward model.
print("MSE:", mean_squared_error(pred_test, y_test))
N, n_predictors = X_train.shape  # rows, columns of the training design matrix
r = lm.score(X_train, y_train)
r_square_adj = 1 - (1 - r) * (N - 1) / (N - n_predictors - 1)
print("r square:", r)
print("r square adj:", r_square_adj)
In [169]:
# statsmodels summary for the forward-selected model (intercept column added).
X_train_constant = sm.add_constant(X_train)
est = sm.OLS(endog=np.asarray(y_train), exog=np.asarray(X_train_constant))
est2 = est.fit()
print(est2.summary())
In [170]:
# Predictors hard-coded for the backward-elimination refit.
backward_columns = [
    'accommodates', 'bathrooms', 'bedrooms', 'security_deposit',
    'cleaning_fee', 'guests_included', 'extra_people', 'nearest_attr_dist',
    'Family/kid friendly', 'Internet', 'Pool', 'Indoor fireplace',
    'Elevator in building', 'Gym', 'Air conditioning', 'Cable TV',
    'neighbourhood_Gràcia', 'neighbourhood_Sant Andreu',
    'neighbourhood_Sants-Montjuïc', 'roomtype_Entire home/apt',
]
feature_backward = dataframe[backward_columns]
In [171]:
# Refit the existing `lm` estimator on the backward-selected columns only.
X = feature_backward
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1
)
lm.fit(X_train, y_train)
pred_train, pred_test = lm.predict(X_train), lm.predict(X_test)
In [172]:
# Test-set MSE, then training-set R^2 and adjusted R^2 for the backward model.
print("MSE:", mean_squared_error(pred_test, y_test))
N, n_predictors = X_train.shape  # rows, columns of the training design matrix
r = lm.score(X_train, y_train)
r_square_adj = 1 - (1 - r) * (N - 1) / (N - n_predictors - 1)
print("r square:", r)
print("r square adj:", r_square_adj)
In [173]:
# statsmodels summary for the backward-selected model (intercept column added).
X_train_constant = sm.add_constant(X_train)
est = sm.OLS(endog=np.asarray(y_train), exog=np.asarray(X_train_constant))
est2 = est.fit()
print(est2.summary())
In [174]:
import seaborn as sns
In [175]:
df_dummies = dataframe[['accommodates', 'bathrooms', 'bedrooms',
'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin',
'review_scores_communication', 'review_scores_location',
'review_scores_value', 'instant_bookable',
'reviews_per_month', 'nearest_attr_dist', 'nearest_attr_rating',
'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
'roomtype_Entire home/apt', 'roomtype_Private room',
'roomtype_Shared room', 'cancellation_policy_flexible',
'cancellation_policy_moderate', 'cancellation_policy_strict',
'cancellation_policy_super_strict_30',
'cancellation_policy_super_strict_60']]
In [176]:
X = df_dummies
In [177]:
sns.set_style("whitegrid")
In [178]:
ax = sns.boxplot(x=df_dummies["reviews_per_month"])
In [179]:
good = [x for x in df_dummies["reviews_per_month"] if x < 3]
In [180]:
best = [x for x in df_dummies["reviews_per_month"] if x >= 3]
In [181]:
# Working frame for the review-class classification (order preserved).
popularity_columns = [
    'require_guest_profile_picture', 'require_guest_phone_verification',
    'host_total_listings_count', 'host_has_profile_pic',
    'host_identity_verified', 'host_response_rate', 'host_is_superhost',
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
    'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'availability_365',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'instant_bookable', 'nearest_attr_dist', 'nearest_attr_rating',
    'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
    'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
    'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
    'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
    'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
    'roomtype_Entire home/apt', 'roomtype_Private room', 'roomtype_Shared room',
    'cancellation_policy_flexible', 'cancellation_policy_moderate',
    'cancellation_policy_strict', 'cancellation_policy_super_strict_30',
    'cancellation_policy_super_strict_60',
    "reviews_per_month",
]
df = dataframe[popularity_columns]
In [182]:
# Bin reviews_per_month into two labelled classes: 0 for [0, 3], 1 for everything above 3.
# The top bin is open-ended: with a fixed upper edge of 10, any value above 10
# falls outside every bin and silently becomes NaN.
cat = pd.cut(df['reviews_per_month'], bins=(0, 3, float("inf")), include_lowest=True, labels=[0, 1])
In [183]:
# `df` is a column selection of `dataframe`, so assigning a column in place
# triggers pandas' SettingWithCopyWarning. assign() returns a new frame with
# the class column added, and rebinding keeps the cell safe to re-run.
df = df.assign(class_reviews=cat)
In [184]:
low = df.loc[df['class_reviews'] == 0]
In [185]:
high = df.loc[df['class_reviews'] == 1]
In [188]:
from sklearn import tree
In [201]:
from sklearn.cross_validation import train_test_split
import statsmodels.api as sm
import sklearn.metrics
In [212]:
clf = tree.DecisionTreeClassifier() #15
In [226]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
parameter_grid = {'max_depth': [4,5,6,7,8],
"min_samples_leaf": [100],
"max_leaf_nodes": [6,7,8,9,10]}
cross_validation = StratifiedKFold(df.class_reviews, n_folds=10)
grid_search = GridSearchCV(clf,
param_grid=parameter_grid,
cv=cross_validation)
grid_search.fit(X, y)
print('Best parameters: {}'.format(grid_search.best_params_))
In [227]:
# Tree with hard-coded hyper-parameters (presumably the grid-search result; the
# values lie inside the searched grid). The seed is fixed because scikit-learn
# permutes features at every split, so an unseeded tree can differ between
# runs when candidate splits tie.
clf = tree.DecisionTreeClassifier(max_depth=4, min_samples_leaf=100, max_leaf_nodes=7, random_state=1) #15
In [228]:
X = df[['accommodates', 'bathrooms', 'bedrooms',
'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
'extra_people', 'minimum_nights', 'maximum_nights',
'instant_bookable',
'nearest_attr_dist', 'nearest_attr_rating',
'roomtype_Entire home/apt', 'roomtype_Private room',
'roomtype_Shared room', 'cancellation_policy_flexible',
'cancellation_policy_moderate', 'cancellation_policy_strict',
'cancellation_policy_super_strict_30',
'cancellation_policy_super_strict_60']]
In [229]:
feature_names=['accommodates', 'bathrooms', 'bedrooms',
'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
'extra_people', 'minimum_nights', 'maximum_nights',
'instant_bookable',
'nearest_attr_dist', 'nearest_attr_rating',
'roomtype_Entire home/apt', 'roomtype_Private room',
'roomtype_Shared room', 'cancellation_policy_flexible',
'cancellation_policy_moderate', 'cancellation_policy_strict',
'cancellation_policy_super_strict_30',
'cancellation_policy_super_strict_60']
In [230]:
target_names=["low_popularity","high_popularity"]
In [231]:
y = df[["class_reviews"]]
In [232]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [233]:
clf = clf.fit(X_train, y_train)
In [234]:
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
In [238]:
from IPython.display import Image
In [239]:
import pydotplus
# Export the fitted tree as DOT text (no output file), parse it, write a PDF.
dot_data = tree.export_graphviz(decision_tree=clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("airbnb.pdf")
Out[239]:
In [240]:
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=feature_names,
class_names=target_names,
filled=True, rounded=True,
special_characters=True)
In [241]:
graph = pydotplus.graph_from_dot_data(dot_data)
In [242]:
Image(graph.create_png())
Out[242]:
In [243]:
# Mean accuracy of the fitted decision tree on the held-out test split.
clf.score(X=X_test, y=y_test)
Out[243]:
In [ ]:
In [ ]: